1 - Cellxgene Census

Author

CDN team

Published

March 5, 2024

Libraries

Installation

install.packages('plotly')
install.packages('colorBlindness')
devtools::install_github('jo-m-lab/ARBOL') # ARBOL is used to plot clusters.
devtools::install_github('immunogenomics/presto') # presto is used to speed up Wilcoxon tests for marker gene calculation in Seurat
suppressPackageStartupMessages({
  library(Seurat)
  library(tidyverse)
  library(ARBOL)
  library(plotly)
  library(colorBlindness)
  library(RColorBrewer)
  library(vegan)
  library(cluster)
})
Found more than one class "dist" in cache; using the first, from namespace 'spam'
Also defined by 'BiocGenerics'
Warning: package 'glmGamPoi' was built under R version 4.3.2
set.seed(687)

Load data

srobj <- readRDS('/Users/kylekimler/Downloads/d8e35450-de43-451a-9979-276eac688bce.rds')
genes <- read_csv('/Users/kylekimler/CDN/workshops/workshop1/data/cov_flu_gene_names_table.csv')
Rows: 33234 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): index, feature_name

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Need to remake seurat object
mtx <- srobj@assays$RNA@data
rownames(mtx) <- genes[match(row.names(mtx),genes$index),]$feature_name

srobj <- CreateSeuratObject(counts = mtx, meta.data = srobj@meta.data)
Warning: Feature names cannot have underscores ('_'), replacing with dashes
('-')
srobj
An object of class Seurat 
33234 features across 59572 samples within 1 assay 
Active assay: RNA (33234 features, 0 variable features)
 1 layer present: counts

Generate a color palette for our cell types

# https://www.datanovia.com/en/blog/easy-way-to-expand-color-palettes-in-r/
# nb.cols <- length(unique(se$Celltype))
# mycolors <- colorRampPalette(paletteMartin)(nb.cols)
pal <- paletteMartin
names(pal) <- sort(unique(srobj$Celltype))

Seurat pre-processing for cluster visualization (UMAP)

srobj <- srobj %>%
    NormalizeData(verbose = FALSE) %>%
    FindVariableFeatures(
        method = "vst",
        nfeatures = 3000,
        verbose = FALSE) %>%
    ScaleData(verbose = FALSE) %>%
    RunPCA(verbose = FALSE) %>%
    RunUMAP(dims = 1:30, verbose = FALSE, n.components=3L)
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session
Found more than one class "dist" in cache; using the first, from namespace 'spam'
Also defined by 'BiocGenerics'
Found more than one class "dist" in cache; using the first, from namespace 'spam'
Also defined by 'BiocGenerics'

Viewing clusters

Typically in scRNA analysis, clusters are viewed by coloring cells on a dimension-reduced latent space of gene expression, like a tSNE or a UMAP. In the dataset we’ve downloaded, we have a tSNE already calculated, so let’s display the authors’ celltypes there first.

By plotting many tSNE’s per sample or per group, we can start to understand how clusters behave across samples.

DimPlot(srobj, 
        reduction='umap', 
        group.by='Celltype',
        cols = pal)

DimPlot(srobj, 
        reduction='umap', 
        group.by='Celltype', 
        split.by='Sample ID',
        ncol=4,
        cols = pal)

For example, we can see the sample from flu patient 5 is dominated by erythrocytes, and these erythrocytes aren’t reliably found across samples, whether diseased or normal. But that doesn’t tell us much about the clusters themselves.

And these celltypes are often validated by plotting heatmaps, dotplots, or violin plots of genes that are most important to each cluster, found by 1 v all differential expression.

Idents(srobj) <- srobj@meta.data$Celltype
celltype_markers <- FindAllMarkers(srobj, 
                        only.pos=TRUE, 
                        logfc.threshold=0.25,
                        min.diff.pct=0.05,
                        )
Calculating cluster B cell, IgG+
Calculating cluster B cell, IgG-
Calculating cluster CD4, EM-like
Calculating cluster CD4, non-EM-like
Calculating cluster CD8, EM-like
Calculating cluster CD8, non-EM-like
Calculating cluster DC
Calculating cluster NK cell
Calculating cluster Platelet
Calculating cluster RBC
Calculating cluster Uncategorized1
Calculating cluster Uncategorized2
Calculating cluster classical Monocyte
Calculating cluster intermediate Monocyte
Calculating cluster nonclassical Monocyte
top_cluster_features <- celltype_markers %>% group_by(cluster) %>% 
                                        filter(p_val_adj==0) %>%
                                        slice_max(avg_log2FC,n=10)

DoHeatmap(srobj, features=top_cluster_features$gene)
Warning in DoHeatmap(srobj, features = top_cluster_features$gene): The
following features were omitted as they were not found in the scale.data slot
for the RNA assay: WLS, CLEC2L, RP11-501J20.5, KIR3DL1, RNF165, BNC2, LGALS9B,
KIF19, MSC, RP11-23P13.6, CHRM3-AS2, FAM153A, LINC00402, TCEA3, NOG, CCR4,
WNT7A, PARP11-AS1, PI16, ABCB4, SCN3A, TCL1B, COL4A4, PLEKHG7

And there are quite a few methods for displaying genes across clusters detailed in Seurat vignettes

Beyond the norm

As we can see, the cell types are somewhat well defined but as usual there are some fuzzy boundaries. Some people like to go 3d to see if these boundaries are resolved.

emb <- Seurat::Embeddings(srobj,reduction='umap')
emb <- emb %>% as.data.frame %>% rownames_to_column('CellID') %>% 
left_join(srobj@meta.data %>% rownames_to_column("CellID"))
Joining with `by = join_by(CellID)`
suppressMessages({
p <- plot_ly(emb, type='scatter3d', 
                color = ~Celltype, 
                x = ~umap_1, 
                y = ~umap_2, 
                z = ~umap_3,
                colors = pal)
p
    })
No scatter3d mode specifed:
  Setting the mode to markers
  Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

But even with 3d we don’t have a way to quantify how useful our clusters are as labels in our dataset. Firstly, with a UMAP or tSNE, we can’t see how well represented samples are across clusters. Some people build cluster-composition bar graphs for this, but these can be difficult to compare, and they don’t show us how large the clusters are. One solution is to make a plot of cluster metrics per cluster. Let’s start with cell counts and sample diversity.

Simpson’s diversity

diversityPerGroup <- function(df, species, group, diversity_metric = 'simpson') {
  # Convert strings to symbols for curly-curly operator
  species_sym <- rlang::sym(species)
  group_sym <- rlang::sym(group)
  # Count groups per species directly using curly-curly
  tierNcount <- df %>%
    group_by({{species_sym}}) %>%
    count({{group_sym}}, name = "n") %>% ungroup
  # Pivot table to allow vegan::diversity call
  tierNwide <- tierNcount %>%
    pivot_wider(names_from = {{group_sym}}, values_from = n, values_fill = list(n = 0))
  # Use rownames of species for the diversity function, which needs a dataframe
  tierNwide_df <- as.data.frame(tierNwide)
  # species column must be first
  tierNwide_df <- tierNwide_df %>% select({{species}}, everything())
  rownames(tierNwide_df) <- tierNwide_df[, 1]
  tierNwide_df <- tierNwide_df[, -1]
  # Calculate diversity
  diversity_values <- vegan::diversity(tierNwide_df, index = diversity_metric)
  # Prepare result as a dataframe
  result <- data.frame(
    species = rownames(tierNwide_df),
    diversity = diversity_values,
    row.names = NULL
  )
  # Rename diversity column
  names(result)[1] <- species
  names(result)[2] <- sprintf('%s_diversity', group)
  return(result)
}

# Calculate simpson's diversity per cluster
clusterMetrics <- diversityPerGroup(srobj@meta.data,
                        species = 'Celltype',
                        group = 'Sample ID',
                        diversity_metric = 'simpson')

# Calculate number of cells per cluster and join to metrics table
clusterMetrics <- clusterMetrics %>% left_join(srobj@meta.data %>% count(Celltype))
Joining with `by = join_by(Celltype)`
# clusterMetrics

# p1 <- ggplot(clusterMetrics, aes(x = Celltype, y = n)) +
#   geom_bar(stat = "identity", fill = 'black') +
#   scale_y_log10() +
#   labs(x = "Cell Type", y = "Cell Number (log scale)") +
#   theme_minimal() +
#   theme(axis.text.x = element_text(angle = 45, hjust = 1))

p2 <- ggplot(clusterMetrics, aes(y=Celltype, fill=`Sample ID_diversity`, x = 1, label = n)) +
  geom_tile(colour = "white") +
  geom_text(nudge_x = 1, size = 6) +
  geom_text(aes(label = signif(`Sample ID_diversity`, 3)),size = 6) +
  scale_fill_distiller(palette = "Spectral", limits = c(0,1)) + theme_classic() +
  coord_fixed(ratio = 1) + 
  expand_limits(x = c(0.5,3)) +
  labs(x = "") +
  theme(axis.text.y = element_text(hjust = 1, vjust = 1, size = 20),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.y = element_text(size = 20),
        legend.key.size = unit(1, 'cm'),
        legend.title = element_text(size=10), 
        legend.text = element_text(size=10)
  )
p2

So in this paper it looks like our clusters are in fact well represented among every sample. None of them are particularly small, but the classical Monocyte cluster contains > 15000 cells. There may be some additional clusters hiding there.

In addition to cell numbers and sample representation, it can be useful to get an idea of how tightly clustered cells are in each cluster, and how distant each cluster is to the others. This way we can quantify how fuzzy the borders are between clusters. Many clustering metrics exist to answer this question (Lim and Qiu 2024) - one popular metric is the average silhouette distance.

Celltypes <- srobj@meta.data$Celltype
pca_embeddings <- Embeddings(srobj, reduction = 'pca')

# Calculate silhouette widths
# sil_widths <- silhouette(x = cluster_assignments, dist = dist(pca_embeddings))
sil_scores <- silhouette(x = as.numeric(Celltypes), dist = dist(pca_embeddings))

# Extract silhouette scores
silhouette_data <- as.data.frame(sil_scores)
# Recover cell type names
silhouette_data$Celltype <- as.character(Celltypes)

silhouette_arranged <- silhouette_data %>% group_by(Celltype) %>% arrange(-sil_width)

silhouette_averages <- silhouette_arranged %>% summarize(avg = mean(sil_width))

avg_silhouettes_plot <- ggplot(silhouette_averages, aes(y = Celltype, x = avg, fill = Celltype, group = Celltype)) +
    geom_bar(stat = "identity", position = 'dodge2') +
    theme_minimal() +
    labs(title = "Average Silhouettes",
        y = "Cluster",
        x = "Average Silhouette width",
        fill = "Cluster") +
    theme(axis.text.y = element_text(hjust = 1, vjust = 0.5, size = 20),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.y = element_text(size = 20),
        legend.position = "None") +
    scale_fill_manual(values = pal)

avg_silhouettes_plot

overall_average <- silhouette_arranged %>% ungroup %>% summarize(ave = as.numeric(mean(sil_width))) %>% pull(ave)

full_silhouettes_plot <- ggplot(silhouette_arranged, aes(y = Celltype, x = sil_width, fill = Celltype, group = Celltype)) +
    geom_bar(stat = "identity", position = 'dodge2') +
    geom_vline(xintercept = overall_average,
               color = 'red',
               linetype = 'longdash') +
    theme_minimal() +
    labs(title = "Silhouette Analysis",
        y = "Cluster",
        x = "Silhouette width",
        fill = "Cluster") +
    theme(axis.text.y = element_text(hjust = 1, vjust = 0.5, size = 20),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.y = element_text(size = 20),
        legend.position = "None") +
    scale_fill_manual(values = pal)

full_silhouettes_plot

With ARBOL, we can build a dendrogram of cluster centroid distances based on gene expression or some dimensionality reduction in the data.

# ARBOL requires 3 metadata entries
srobj@meta.data$tierNident <- srobj@meta.data$Celltype
srobj@meta.data$CellID <- row.names(srobj@meta.data)
srobj@meta.data$sample <- srobj@meta.data$`Sample ID`

srobj <- ARBOLcentroidTaxonomy(srobj, 
                               tree_reduction='pca'
                               )
Bootstrap (r = 0.48)... Done.
Bootstrap (r = 0.6)... Done.
Bootstrap (r = 0.68)... Done.
Bootstrap (r = 0.8)... Done.
Bootstrap (r = 0.88)... Done.
Bootstrap (r = 1.0)... Done.
Bootstrap (r = 1.08)... Done.
Bootstrap (r = 1.2)... Done.
Bootstrap (r = 1.28)... Done.
Bootstrap (r = 1.4)... Done.
ggraph(srobj@misc$tax_ggraph, layout = 'dendrogram', height=plotHeight) +
  geom_edge_elbow() +
  geom_node_text(aes(filter = leaf, label = name, color = name), nudge_y=4,vjust=0.5,hjust=0,size=5) +
  scale_color_manual(values = pal) +
  geom_node_text(aes(filter = leaf, label = n),color='grey30',nudge_y=1,vjust=0.5,hjust=0,size=4) + 
  theme_void() +
  new_scale('color') +
  geom_node_point(aes(filter = leaf, color=sample_diversity),size=4,shape='square') + 
  scale_color_gradient(low='grey90',high='grey10',limits=c(0,1)) +
  coord_flip() + scale_y_reverse() +
  expand_limits(y=-20)

References

Lim, Hong Seo, and Peng Qiu. 2024. “Quantifying the Clusterness and Trajectoriness of Single-Cell RNA-Seq Data.” PLOS Computational Biology 20 (2): e1011866. https://doi.org/10.1371/journal.pcbi.1011866.